#Background Information on Data Set
# About Dataset: Concerns housing values in suburbs of Boston.
# Number of Instances: 506
# Number of Attributes: 13 continuous attributes (including "class" attribute "MEDV"), 1 binary-valued attribute.
# Attribute Information:
#   CRIM    - per capita crime rate by town
#   ZN      - proportion of residential land zoned for lots over 25,000 sq.ft.
#   INDUS   - proportion of non-retail business acres per town
#   CHAS    - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#   NOX     - nitric oxides concentration (parts per 10 million)
#   RM      - average number of rooms per dwelling
#   AGE     - proportion of owner-occupied units built prior to 1940
#   DIS     - weighted distances to five Boston employment centres
#   RAD     - index of accessibility to radial highways
#   TAX     - full-value property-tax rate per $10,000
#   PTRATIO - pupil-teacher ratio by town
#   B       - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#   LSTAT   - % lower status of the population
#   MEDV    - median value of owner-occupied homes in $1000's
# Missing Attribute Values: None.
#Importation of Data Set
# Read the Boston housing data from the working directory; str() confirms the
# expected columns (506 complete rows plus a handful of NA rows per str below).
realEstateraw <- read.csv("Real_Estate.csv")
str(realEstateraw)
## 'data.frame': 511 obs. of 14 variables:
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ RM : num 6.58 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : int 296 242 242 222 222 222 311 311 311 311 ...
## $ PTRATIO: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ B : num 397 397 393 395 397 ...
## $ LSTAT : num 4.98 9.14 4.03 2.94 5.33 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Package dependencies for this analysis.
library(corrplot, quietly = TRUE)   # correlation-matrix visualisations
## corrplot 0.92 loaded
library(car, quietly = TRUE)        # vif() for multicollinearity checks
library(addAlpha, quietly = TRUE)   # add.alpha() for transparent plot colours
library(RColorBrewer, quietly = TRUE) #This package includes the colorblind friendly colors that we will be using.
# Locate missing cells: which() on the logical matrix returns linear cell indices.
which(is.na(realEstateraw))
# Rows containing any missing value. complete.cases() is the idiomatic row-wise
# check; the previous form indexed the data frame with the full logical matrix
# from is.na(), which relies on undocumented coercion behaviour.
rownames(realEstateraw[!complete.cases(realEstateraw), ])
# Drop the incomplete rows before modelling.
realEstate <- na.omit(realEstateraw)
# Log-transform the right-skewed predictors; the +1 guards against log(0).
# NOTE(review): LSTAT uses the natural log while PTRATIO/CRIM use log10 —
# confirm the mixed bases are intended.
realEstate$LOGLSTAT <- log(realEstate$LSTAT + 1)
realEstate$PTRATIOLOG <- log10(realEstate$PTRATIO + 1)
realEstate$CRIMELOG <- log10(realEstate$CRIM + 1)
# All columns except the outcome MEDV.
realEstateColsansMEDV <- colnames(realEstate)[!colnames(realEstate) %in% c("MEDV")]
# Correlation matrix with MEDV first. cor() already sets matching dimnames,
# so the previous manual colnames()/rownames() assignments were redundant.
correlationMatrix <- cor(realEstate[, c("MEDV", realEstateColsansMEDV)])
# Diverging palette so the sign of each correlation maps to hue.
divergingColours <- hcl.colors(256, palette = "BrBG")
# Upper-triangle circle plot of all pairwise correlations (diagonal suppressed).
corrplot(correlationMatrix,
method = "circle",
type = "upper",
col = divergingColours,
diag = F,
outline = T,
tl.col = rgb(0,0,0))
# Keep only the predictors used by the "original" model: RM, LOGLSTAT and
# PTRATIOLOG; everything else is excluded.
# Fixed: "CRIM" appeared twice in the exclusion set.
realEstateColsansMEDV <- colnames(realEstate)[!colnames(realEstate) %in% c(
  "MEDV", "CHAS", "CRIM", "CRIMELOG", "TAX", "DIS",
  "ZN", "AGE", "NOX", "LSTAT", "B", "PTRATIO", "RAD", "INDUS"
)]
# Colour-blind-friendly qualitative palette for the predictor plots.
predictorColors <- brewer.pal(8, "Set2")
#It is time for multiple linear regression, as we are not able to pass assumptions for linear regression for any individual predictor variable.
# Untransformed candidate predictors and their display names.
# Fixed: "Averge" typo in the first panel title.
ogVal <- c("RM", "PTRATIO", "LSTAT", "CRIM")
ogPretty <- c("Average Number of Rooms Per Dwelling",
              "Pupil-Teacher Ratio",
              "Lower Status of Population as Percentage",
              "Crime Rate Per Capita")
predVals <- ogVal
predVals_pretty <- ogPretty
# 2x2 grid of scatterplots: each untransformed predictor vs MEDV.
par(mfcol = c(2, 2), omi = c(.3, .75, .3, .3), mai = c(.5, .5, .5, .5))
for (i in seq_along(ogVal)) {
  plot(realEstate[, ogVal[i]], realEstate[, "MEDV"],
       ann = FALSE,
       axes = FALSE,
       col = predictorColors[i],
       lwd = 3.5,
       ylim = range(pretty(range(realEstate[, "MEDV"]))),
       xlim = range(pretty(range(realEstate[, ogVal[i]]))))
  mtext(predVals_pretty[i], side = 3, las = 1, adj = .1, font = 2,
        col = predictorColors[i], cex = 1)
  axis(2, at = pretty(range(realEstate[, "MEDV"])), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(realEstate[, "MEDV"])), side = 2,
        at = pretty(realEstate[, "MEDV"]), line = 1, lwd = 1, las = 1)
  # Shared y-axis title only on the left-hand column (panels 1-2 in mfcol order).
  if (i %in% c(1, 2)) {
    mtext("Median Value\nof Owner-Occupied Homes\n($1000 USD)", side = 2, line = 2.75, font = 2)
  }
  axis(1, at = pretty(range(realEstate[, ogVal[i]])), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(realEstate[, ogVal[i]])),
        at = pretty(range(realEstate[, ogVal[i]])), side = 1, line = 1)
}
# Scatterplots for the retained (possibly log-transformed) predictors vs MEDV.
predVals <- realEstateColsansMEDV
predVals_pretty <- c("Average Number of\n Rooms per dwelling",
                     "Log Percentage of Lower\n Status Of The\n Population",
                     "Log Pupil-Teacher\n Ratio By Town")
par(mfcol = c(2, 2), omi = c(.3, .75, .3, .3), mai = c(.5, .5, .5, .5))
for (i in seq_along(realEstateColsansMEDV)) {
  plot(realEstate[, realEstateColsansMEDV[i]], realEstate[, "MEDV"],
       ann = FALSE,
       axes = FALSE,
       col = predictorColors[i],
       lwd = 3.5,
       ylim = range(pretty(range(realEstate[, "MEDV"]))),
       xlim = range(pretty(range(realEstate[, realEstateColsansMEDV[i]]))))
  mtext(predVals_pretty[i], side = 3, las = 1, adj = .1, font = 2,
        col = predictorColors[i], cex = 1)
  axis(2, at = pretty(range(realEstate[, "MEDV"])), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(realEstate[, "MEDV"])), side = 2,
        at = pretty(realEstate[, "MEDV"]), line = 1, lwd = 1, las = 1)
  # Shared y-axis title only on the left-hand column.
  if (i %in% c(1, 2)) {
    mtext("Median Value\nof Owner-Occupied Homes\n($1000 USD)", side = 2, line = 2.75, font = 2)
  }
  axis(1, at = pretty(range(realEstate[, realEstateColsansMEDV[i]])), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(realEstate[, realEstateColsansMEDV[i]])),
        at = pretty(range(realEstate[, realEstateColsansMEDV[i]])), side = 1, line = 1)
}
# Scatterplot of log10 crime rate vs MEDV (completes the transformed-predictor set).
plot(realEstate[,'CRIMELOG'], realEstate[,'MEDV'],
ann= F,
axes= F,
col = predictorColors[4],
lwd = 3.5,
ylim = range(pretty(range(realEstate[,'MEDV']))),
xlim = range(pretty(range(realEstate[,'CRIMELOG']))))
mtext("Log Crime Rate Per Capita", side = 3, las =1, adj = .1, font = 2, col = predictorColors[4], cex = 1)
axis(2, at = pretty(range(realEstate[,'MEDV'])), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(realEstate[,'MEDV'])),side = 2,at = pretty(realEstate[,'MEDV']), line = 1, lwd = 1, las = 1)
axis(1, at = pretty(range(realEstate[,'CRIMELOG'])) , lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(realEstate[,"CRIMELOG"])), at = pretty(range(realEstate[,'CRIMELOG'])),side =1 , line = 1)
# Side-by-side boxplots of the original model's predictors.
predVals <- realEstateColsansMEDV
predVals_pretty <- c("Average Number of\n Rooms per dwelling",
                     "Log Percentage of Lower\n Status Of The\n Population",
                     "Log Pupil-Teacher\n Ratio By Town")
par(mfcol = c(1, length(predVals)), mai = c(.1, .5, .1, .5), omi = c(.25, .25, .25, .25))
for (i in seq_along(predVals)) {
  # Interior predictors get an extra log(x + 1); first and last are plotted as-is.
  # NOTE(review): LOGLSTAT is already log-transformed, so this applies log()
  # twice to it — confirm intended.
  if (i != 1 && i < length(predVals)) {
    yLim <- range(pretty(range(log(realEstate[, predVals[i]] + 1))))
    boxplot(log(realEstate[, predVals[i]] + 1), axes = FALSE, ylim = yLim,
            col = rgb(1, 1, 1), border = predictorColors[i], lwd = 2)
  } else {
    yLim <- range(pretty(range(realEstate[, predVals[i]])))
    boxplot(realEstate[, predVals[i]], axes = FALSE, ylim = yLim,
            col = rgb(1, 1, 1), border = predictorColors[i], lwd = 2)
  }
  axis(2, at = pretty(yLim), lwd = 2, tck = -.025, labels = FALSE)
  mtext(pretty(yLim), 2, at = pretty(yLim), line = .75, las = 1)
  mtext(predVals_pretty[i], 3, font = 2, cex = .8, line = -1.5)
}
# Original Model Predictor Variables
# Side-by-side boxplots for the augmented predictor set (original three + log crime).
predVals <- c(realEstateColsansMEDV, "CRIMELOG")
predVals_pretty <- c("Average Number of\n Rooms per dwelling",
                     "Percentage of Lower\n Status Of The\n Population",
                     "Pupil-Teacher\n Ratio By Town",
                     "Log Crime Rate\nPer-Capita")
par(mfcol = c(1, length(predVals)), mai = c(.1, .5, .1, .5), omi = c(.25, .25, .25, .25))
for (i in seq_along(predVals)) {
  # Interior predictors get an extra log(x + 1); first and last are plotted as-is.
  # NOTE(review): LOGLSTAT and PTRATIOLOG are already log-transformed, so this
  # double-logs them — confirm intended.
  if (i != 1 && i < length(predVals)) {
    yLim <- range(pretty(range(log(realEstate[, predVals[i]] + 1))))
    boxplot(log(realEstate[, predVals[i]] + 1), axes = FALSE, ylim = yLim,
            col = rgb(1, 1, 1), border = predictorColors[i], lwd = 2)
  } else {
    yLim <- range(pretty(range(realEstate[, predVals[i]])))
    boxplot(realEstate[, predVals[i]], axes = FALSE, ylim = yLim,
            col = rgb(1, 1, 1), border = predictorColors[i], lwd = 2)
  }
  axis(2, at = pretty(yLim), lwd = 2, tck = -.025, labels = FALSE)
  mtext(pretty(yLim), 2, at = pretty(yLim), line = .75, las = 1)
  mtext(predVals_pretty[i], 3, font = 2, cex = .8, line = -1.5)
}
# Augmented Model Predictor Variables
predVals <- c(realEstateColsansMEDV)
predVals_pretty <- c("Average Number of\n Rooms per dwelling",
                     "Percentage of Lower\n Status Of The\n Population",
                     "Pupil-Teacher\n Ratio By Town")
# Semi-transparent fill colours for the histogram bars.
predictorColorsT <- add.alpha(predictorColors, .9)
# Compute the histogram without plotting so axis limits can be derived from it.
out76 <- hist(realEstate$MEDV, plot = FALSE)
# Fixed: the hist() component is "counts"; "count" only resolved via `$`
# partial matching, which is a silent-failure footgun.
yticks <- pretty(c(0, out76$counts))
ylimit <- range(yticks)
hist(realEstate$MEDV, ylim = ylimit, axes = FALSE, ann = FALSE, col = predictorColorsT[7])
axis(1, at = out76$breaks, lwd = 2, tck = -.01, labels = FALSE)
axis(2, at = yticks, lwd = 2, tck = -.01, labels = FALSE)
mtext(out76$breaks, side = 1, at = out76$breaks, line = .3)
mtext(yticks, side = 2, at = yticks, line = .3, las = 1)
mtext("Frequency", side = 2, line = 1.75, font = 2)
mtext("Median House Value ($1000 USD)", side = 1, line = 1.5, font = 2)
mtext("Outcome Variable Distribution", col = predictorColors[7], font = 2, adj = -.0001)
# Correlation matrix of the outcome and the retained predictors, relabelled
# with display names for the plot.
correlationMatrix <- cor(realEstate[, c("MEDV", predVals)])
colnames(correlationMatrix) <- c("MEDV", predVals_pretty)
rownames(correlationMatrix) <- c("MEDV", predVals_pretty)
# Diverging palette for the relabelled correlation matrix.
divergingColours <- hcl.colors(256, palette = "Cork")
# Ellipse glyphs: eccentricity/orientation encode correlation strength/sign.
corrplot(correlationMatrix,
method = "ellipse",
type = "upper",
col = divergingColours,
diag = F,
outline = T,
tl.col = rgb(.4,.4,.4),
tl.offset = .5)
# Wider left outer margin to fit the multi-line axis labels.
par(omi = c(.25,.5,.25,.25))
# Correlation matrix including the augmented predictor CRIMELOG.
correlationMatrix <- cor(realEstate[, c("MEDV", predVals, "CRIMELOG")])
colnames(correlationMatrix) <- c("Median Value\nof Owner-Occupied Homes\n($1000 USD)", predVals_pretty, "Log Crime Rate\nPer-Capita")
rownames(correlationMatrix) <- c("Median Value\nof Owner-Occupied Homes\n($1000 USD)", predVals_pretty, "Log Crime Rate\nPer-Capita")
divergingColours <- hcl.colors(256, palette = "Cork")
corrplot(correlationMatrix,
method = "ellipse",
type = "upper",
col = divergingColours,
diag = F,
outline = T,
tl.col = rgb(.4,.4,.4),
tl.offset = .5)
# NOTE(review): this chunk is an exact duplicate of the previous correlation
# plot (same matrix, same options) — likely a copy-paste left in; confirm and remove.
correlationMatrix <- cor(realEstate[, c("MEDV", predVals, "CRIMELOG")])
colnames(correlationMatrix) <- c("Median Value\nof Owner-Occupied Homes\n($1000 USD)", predVals_pretty, "Log Crime Rate\nPer-Capita")
rownames(correlationMatrix) <- c("Median Value\nof Owner-Occupied Homes\n($1000 USD)", predVals_pretty, "Log Crime Rate\nPer-Capita")
divergingColours <- hcl.colors(256, palette = "Cork")
corrplot(correlationMatrix,
method = "ellipse",
type = "upper",
col = divergingColours,
diag = F,
outline = T,
tl.col = rgb(.4,.4,.4),
tl.offset = .5)
# Reproducible ~75/25 train/test split via a row-wise Bernoulli draw.
set.seed(21)
trainIDX <- sample(x = c(TRUE, FALSE), size = nrow(realEstate), replace = TRUE, prob = c(.75, .25))
trainingData <- realEstate[trainIDX, ]
testingData <- realEstate[!trainIDX, ]
# Original model: RM plus the log-transformed LSTAT and PTRATIO predictors.
originalModel <- lm(MEDV ~ RM + LOGLSTAT + PTRATIOLOG, data = trainingData)
theOutput <- summary(originalModel)
theOutput
##
## Call:
## lm(formula = MEDV ~ RM + LOGLSTAT + PTRATIOLOG, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.288 -3.268 -1.029 1.907 39.936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.6966 9.0358 6.607 1.35e-10 ***
## RM 3.5441 0.5257 6.741 5.94e-11 ***
## LOGLSTAT -9.2504 0.7151 -12.936 < 2e-16 ***
## PTRATIOLOG -28.0676 6.1657 -4.552 7.18e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.587 on 375 degrees of freedom
## Multiple R-squared: 0.6579, Adjusted R-squared: 0.6552
## F-statistic: 240.4 on 3 and 375 DF, p-value: < 2.2e-16
# Diagnostic-plot palette; qqColT is the transparent variant for point clouds.
qqCol <- brewer.pal(12, "Paired")
qqColT <- add.alpha(qqCol)
# Residual QQ plot to check the normality-of-residuals assumption.
n <- qqnorm(theOutput$residuals,
ann = F,
axes = F,
col = qqColT[2],
lwd = 3.5,
xlim = range(pretty(range(-3,3))),
ylim = range(pretty(range(theOutput$residuals)))
)
qqline(theOutput$residuals,
ann = F,
col = qqCol[2],
lwd = 3.5
)
mtext("Original Model QQ Plot", side = 3, las =1, adj = .1, font = 2, col = qqCol[1], cex = 1)
axis(2, at = pretty(range(n$y)), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(n$y)),side = 2,at = pretty(range(n$y)), line = 1, lwd = 1, las = 1)
mtext("Sample Quantiles", side =2 , line = 2.75, font = 2)
axis(1, at = pretty(range(-3,3)), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(-3,3)), at = pretty(range(-3,3)),side =1 , line = 1)
mtext("Theoretical Quantiles", side =1 , line = 2.75, font = 2)
# Variance inflation factors for the original model (multicollinearity check).
originalVif <- data.frame(vif(originalModel))
colnames(originalVif) <- c("VIF Scores")
# NOTE(review): "Pupil-Teacher Ratio" here labels the log10-transformed PTRATIOLOG term.
rownames(originalVif) <- c("Average Number of Rooms per Dwelling", "Log Percentage of Lower Status", "Pupil-Teacher Ratio")
# Residuals vs outcome to eyeball homoscedasticity of the original model.
plot(trainingData$MEDV, theOutput$residuals,
ann = F,
axes = F,
col = qqColT[2],
lwd = 4.5,
xlim = range(pretty(range(trainingData$MEDV))),
ylim = range(pretty(range(theOutput$residuals)))
)
mtext("Original Model Homoscedasticity Plot", side = 3, las =1, adj = .1, font = 2, col = qqCol[2], cex = 1)
axis(2, at = pretty(range(theOutput$residuals)), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(theOutput$residuals)),side = 2,at = pretty(range(theOutput$residuals)), line = 1, lwd = 1, las = 1)
mtext("Original Model Residuals", side =2 , line = 2.75, font = 2)
# The x-axis is drawn at y = 0; the negative line values pull the tick labels
# up toward that axis rather than the figure margin.
axis(1, at = pretty(range(trainingData$MEDV)), pos = 0, lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(trainingData$MEDV)), at = pretty(trainingData$MEDV),side =1 , line = -8)
mtext("Median Value of Owner Occupied Homes ($1000USD)", side = 1, line = -1.0, font = 2)
#We now repeat process with the extra variable crime
# Augmented model: the original three predictors plus log10 crime rate.
secondModel <- lm(MEDV ~ RM + LOGLSTAT+ PTRATIOLOG + CRIMELOG, data = trainingData)
theSecondOutput <- summary(secondModel)
theSecondOutput
##
## Call:
## lm(formula = MEDV ~ RM + LOGLSTAT + PTRATIOLOG + CRIMELOG, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.369 -3.198 -1.096 1.958 39.749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.7744 9.3522 6.285 9.16e-10 ***
## RM 3.5660 0.5293 6.737 6.13e-11 ***
## LOGLSTAT -9.1067 0.8058 -11.302 < 2e-16 ***
## PTRATIOLOG -27.6478 6.2665 -4.412 1.34e-05 ***
## CRIMELOG -0.3090 0.7952 -0.389 0.698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.593 on 374 degrees of freedom
## Multiple R-squared: 0.6581, Adjusted R-squared: 0.6544
## F-statistic: 180 on 4 and 374 DF, p-value: < 2.2e-16
# NOTE(review): qqCol is reassigned with the identical palette here — redundant.
qqCol <- brewer.pal(12, "Paired")
# Residual QQ plot for the augmented model.
n <- qqnorm(theSecondOutput$residuals,
ann = F,
axes = F,
col = qqColT[8],
lwd = 3.5,
xlim = range(pretty(range(-3,3))),
ylim = range(pretty(range(theSecondOutput$residuals)))
)
qqline(theSecondOutput$residuals,
ann = F,
col = qqCol[8],
lwd = 3.5
)
mtext("Augmented Model QQ Plot", side = 3, las =1, adj = .1, font = 2, col = qqCol[7], cex = 1)
axis(2, at = pretty(range(n$y)), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(n$y)),side = 2,at = pretty(range(n$y)), line = 1, lwd = 1, las = 1)
mtext("Sample Quantiles", side =2 , line = 2.75, font = 2)
axis(1, at = pretty(range(-3,3)), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(-3,3)), at = pretty(range(-3,3)),side =1 , line = 1)
mtext("Theoretical Quantiles", side =1 , line = 2.75, font = 2)
# Variance inflation factors for the augmented model's predictors.
secondVif <- data.frame(vif(secondModel))
colnames(secondVif) <- c("VIF Scores")
rownames(secondVif) <- c("Average Number of Rooms per Dwelling", "Log Percentage of Lower Status", "Pupil-Teacher Ratio", "Log of Crime Rate Per Capita")
# Pad the original model's VIF table with an NA row (it has no CRIMELOG term)
# so it can be column-bound with the augmented table below.
originalVif <- rbind(originalVif, NA)
rownames(originalVif) <- rownames(secondVif)
# Print the augmented model's VIF scores; all are well below common cutoffs (5/10).
secondVif
## VIF Scores
## Average Number of Rooms per Dwelling 1.733525
## Log Percentage of Lower Status 2.339043
## Pupil-Teacher Ratio 1.289089
## Log of Crime Rate Per Capita 1.524919
# Combine both models' VIF scores into a single comparison table.
totalVif <- cbind(originalVif, secondVif)
colnames(totalVif) <- c("Original Model VIF Scores", "Augmented Model VIF Scores")
totalVif
## Original Model VIF Scores
## Average Number of Rooms per Dwelling 1.713831
## Log Percentage of Lower Status 1.846331
## Pupil-Teacher Ratio 1.250782
## Log of Crime Rate Per Capita NA
## Augmented Model VIF Scores
## Average Number of Rooms per Dwelling 1.733525
## Log Percentage of Lower Status 2.339043
## Pupil-Teacher Ratio 1.289089
## Log of Crime Rate Per Capita 1.524919
# Residuals vs outcome to eyeball homoscedasticity of the augmented model.
plot(trainingData$MEDV, theSecondOutput$residuals,
ann = F,
axes = F,
col = qqColT[8],
lwd = 4.5,
xlim = range(pretty(range(trainingData$MEDV))),
ylim = range(pretty(range(theSecondOutput$residuals)))
)
mtext("Augmented Model Homoscedasticity Plot", side = 3, las =1, adj = .1, font = 2, col = qqCol[8], cex = 1)
axis(2, at = pretty(range(theSecondOutput$residuals)), lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(theSecondOutput$residuals)),side = 2,at = pretty(range(theSecondOutput$residuals)), line = 1, lwd = 1, las = 1)
mtext("Augmented Model Residuals", side =2 , line = 2.75, font = 2)
# x-axis drawn at y = 0; negative line values place labels near that axis.
axis(1, at = pretty(range(trainingData$MEDV)), pos = 0, lwd = 2, tck = -.0125, labels = F)
mtext(pretty(range(trainingData$MEDV)), at = pretty(trainingData$MEDV),side =1 , line = -8)
mtext("Median Value of Owner Occupied Homes ($1000USD)", side = 1, line = -1.0, font = 2)
# AIC (up to an additive constant): N * ln(sigma^2) + 2k, i.e. N*log(SSE/N) + 2k
#Must Redo this thing right here with the testing dta and not with the original model
# Evaluate both models on the held-out testing data.
ogYPrediction <- predict(originalModel, newdata = testingData, type = "response")
sgYPrediction <- predict(secondModel, newdata = testingData, type = "response")
# Residual (error) sums of squares on the test set.
ogSSE <- sum((testingData$MEDV - ogYPrediction)^2)
sgSSE <- sum((testingData$MEDV - sgYPrediction)^2)
# "Explained" sums of squares relative to the test-set mean.
ogSSR <- sum((ogYPrediction - mean(testingData$MEDV))^2)
sgSSR <- sum((sgYPrediction - mean(testingData$MEDV))^2)
ogR2 <- ogSSR / (ogSSR + ogSSE)
sgR2 <- sgSSR / (sgSSR + sgSSE)
# AIC up to an additive constant: n*log(SSE/n) + 2k.
# Fixed: the 2k penalty was previously inside the parentheses, i.e.
# n*(log(SSE/n) + 2k), which multiplied the penalty term by n.
nTest <- nrow(testingData)
ogAIC <- nTest * log(ogSSE / nTest) + 2 * 3
sgAIC <- nTest * log(sgSSE / nTest) + 2 * 4
# Side-by-side predicted-vs-actual plots for the two models on the test set.
par(mfrow = c(1, 2))
# --- Original model panel ---
plot(ogYPrediction, testingData$MEDV,
     ann = FALSE,
     axes = FALSE,
     col = qqColT[2],
     lwd = 3.5,
     xlim = range(pretty(range(ogYPrediction))),
     ylim = range(pretty(range(testingData$MEDV))))
ogr2text <- paste("R^2", "=", signif(ogR2, 4))
# Fixed: signif() was called with an empty digits argument; now 4 to match
# the augmented panel.
ogAICtext <- paste("AIC = ", signif(ogAIC, 4))
legend("topright", legend = c(ogr2text, ogAICtext), bty = "n", title.col = qqCol[8])
mtext("Original Model Predictions vs Actual", side = 3, las = 1, font = 2, col = qqCol[2], cex = 1)
axis(1, at = pretty(range(ogYPrediction)), lwd = 2, tck = -.0125, labels = FALSE)
mtext(pretty(range(ogYPrediction)), side = 1, at = pretty(range(ogYPrediction)), line = 1, lwd = 1, las = 1)
mtext("Predicted MEDV Values", side = 1, line = 2.75, font = 2)
axis(2, at = pretty(range(testingData$MEDV)), lwd = 2, tck = -.0125, labels = FALSE)
mtext(pretty(range(testingData$MEDV)), side = 2, at = pretty(range(testingData$MEDV)), line = 1, lwd = 1, las = 1)
mtext("Actual MEDV Values", side = 2, line = 2.75, font = 2)
# --- Augmented model panel ---
plot(sgYPrediction, testingData$MEDV,
     ann = FALSE,
     axes = FALSE,
     col = qqColT[8],
     lwd = 3.5,
     xlim = range(pretty(range(sgYPrediction))),
     ylim = range(pretty(range(testingData$MEDV))))
# Fixed: legend label was the typo "r2, = "; now matches the other panel.
sgr2text <- paste("R^2", "=", signif(sgR2, 4))
sgAICtext <- paste("AIC = ", signif(sgAIC, 4))
legend("topright", legend = c(sgr2text, sgAICtext), bty = "n", title.col = qqCol[8])
mtext("Augmented Model Predictions vs Actual", side = 3, las = 1, font = 2, col = qqCol[8], cex = 1)
axis(1, at = pretty(range(sgYPrediction)), lwd = 2, tck = -.0125, labels = FALSE)
mtext(pretty(range(sgYPrediction)), side = 1, at = pretty(range(sgYPrediction)), line = 1, lwd = 1, las = 1)
mtext("Predicted MEDV Values", side = 1, line = 2.75, font = 2)
axis(2, at = pretty(range(testingData$MEDV)), lwd = 2, tck = -.0125, labels = FALSE)
mtext(pretty(range(testingData$MEDV)), side = 2, at = pretty(range(testingData$MEDV)), line = 1, lwd = 1, las = 1)
mtext("Actual MEDV Values", side = 2, line = 2.75, font = 2)
# Generate New Training Data and Testing Data
# ogVal
# ogPretty
# Fresh reproducible ~75/25 split for the polynomial-model section.
set.seed(22)
trainIDX <- sample(x = c(TRUE, FALSE), size = nrow(realEstate), replace = TRUE, prob = c(.75, .25))
trainingData <- realEstate[trainIDX, ]
testingData <- realEstate[!trainIDX, ]
# Create Linear Models
# Simple linear models, one per predictor.
# Fixed idiom: use data= rather than trainingData$ inside the formula, so the
# fitted objects carry clean term names and work with predict(newdata = ...).
lstatPM_1 <- lm(MEDV ~ LSTAT, data = trainingData)
ptratioPM_1 <- lm(MEDV ~ PTRATIO, data = trainingData)
rmPM_1 <- lm(MEDV ~ RM, data = trainingData)
crimePM_1 <- lm(MEDV ~ CRIM, data = trainingData)
# Create Polynomial Models (DEGREE 2)
# Degree-2 orthogonal-polynomial models, one per predictor.
# Fixed idiom: data= instead of trainingData$ inside the formula.
lstatPM_2 <- lm(MEDV ~ poly(LSTAT, 2), data = trainingData)
ptratioPM_2 <- lm(MEDV ~ poly(PTRATIO, 2), data = trainingData)
rmPM_2 <- lm(MEDV ~ poly(RM, 2), data = trainingData)
crimePM_2 <- lm(MEDV ~ poly(CRIM, 2), data = trainingData)
# Create Polynomial Models (DEGREE 3)
# Degree-3 orthogonal-polynomial models, one per predictor.
# Fixed idiom: data= instead of trainingData$ inside the formula.
lstatPM_3 <- lm(MEDV ~ poly(LSTAT, 3), data = trainingData)
ptratioPM_3 <- lm(MEDV ~ poly(PTRATIO, 3), data = trainingData)
rmPM_3 <- lm(MEDV ~ poly(RM, 3), data = trainingData)
crimePM_3 <- lm(MEDV ~ poly(CRIM, 3), data = trainingData)
# Define MakeQQ Function
# Draw a normal QQ plot with custom axes for a fitted model's residuals.
#   theOutput   - an lm fit (or summary) exposing a $residuals component.
#   title_      - panel title, drawn in the panel colour.
#   colNum      - index into the global qqCol/qqColT palettes.
#   supressCol  - suppress the y-axis title (for shared-axis grids).
#   supressRow  - suppress the x-axis title (for shared-axis grids).
# Fixed idiom: TRUE/FALSE defaults instead of T/F; removed the spurious
# `ann` argument passed to qqline() (not a qqline parameter).
makeqq <- function(theOutput, title_, colNum, supressCol = FALSE, supressRow = FALSE) {
  n <- qqnorm(theOutput$residuals,
              ann = FALSE,
              axes = FALSE,
              col = qqColT[colNum],
              lwd = 3.5,
              xlim = range(pretty(range(-3, 3))),
              ylim = range(pretty(range(theOutput$residuals))))
  qqline(theOutput$residuals,
         col = qqCol[colNum],
         lwd = 3.5)
  mtext(title_, side = 3, las = 1, adj = .1, font = 2, col = qqCol[colNum], cex = 1)
  axis(2, at = pretty(range(n$y)), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(n$y)), side = 2, at = pretty(range(n$y)), line = 1, lwd = 1, las = 1)
  if (!supressCol) {
    mtext("Sample Quantiles", side = 2, line = 2.75, font = 2)
  }
  axis(1, at = pretty(range(-3, 3)), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(-3, 3)), at = pretty(range(-3, 3)), side = 1, line = 1)
  if (!supressRow) {
    mtext("Theoretical Quantiles", side = 1, line = 2.75, font = 2)
  }
}
# QQ Plots for Linear Models
# 2x2 grid of residual QQ plots for the four single-predictor linear models.
par(mfcol = c(2,2), omi = c(.5,.5,.5,.5))
makeqq(lstatPM_1, "QQ Plot for LSTAT\nLinear Model", 1, supressRow = T)
makeqq(ptratioPM_1, "QQ Plot for Pupil-Teacher Ratio\nLinear Model", 3)
makeqq(rmPM_1, "QQ Plot for Rooms per Dwelling\nLinear Model", 5, supressCol = T, supressRow = T)
makeqq(crimePM_1, "QQ Plot for Crime Per Capita\nLinear Model", 7, supressCol = T)
# QQ Plots for Quadratic Models
# 2x2 grid of residual QQ plots for the degree-2 single-predictor models.
par(mfcol = c(2,2), omi = c(.5,.5,.5,.5))
makeqq(lstatPM_2, "QQ Plot for LSTAT\nPolynomial Degree 2 Model", 4, supressRow = T)
makeqq(ptratioPM_2, "QQ Plot for Pupil-Teacher Ratio\nPolynomial Degree 2 Model", 6)
makeqq(rmPM_2, "QQ Plot for Rooms per Dwelling\nPolynomial Degree 2 Model", 8, supressCol = T, supressRow = T)
makeqq(crimePM_2, "QQ Plot for Crime Per Capita\nPolynomial Degree 2 Model", 10, supressCol = T)
# QQ Plots for Cubic Models
# 2x2 grid of residual QQ plots for the degree-3 single-predictor models.
par(mfcol = c(2,2), omi = c(.5,.5,.5,.5))
makeqq(lstatPM_3, "QQ Plot for LSTAT\nPolynomial Degree 3 Model", 5, supressRow = T)
makeqq(ptratioPM_3, "QQ Plot for Pupil-Teacher Ratio\nPolynomial Degree 3 Model", 9)
makeqq(rmPM_3, "QQ Plot for Rooms per Dwelling\nPolynomial Degree 3 Model", 7, supressCol = T, supressRow = T)
makeqq(crimePM_3, "QQ Plot for Crime Per Capita\nPolynomial Degree 3 Model", 1, supressCol = T)
#From this point, we begin to take the final Multiple Polynomial
# Regression Models
# Make multiple linear/quadratic/cubic models
# Multiple polynomial models on the original three predictors, degrees 3/2/1.
degree_3_original_model <- lm(MEDV ~ poly(LSTAT, 3) + poly(RM, 3) + poly(PTRATIO, 3), data = trainingData)
degree_2_original_model <- lm(MEDV ~ poly(LSTAT, 2) + poly(RM, 2) + poly(PTRATIO, 2), data = trainingData)
degree_1_original_model <- lm(MEDV ~ poly(LSTAT, 1) + poly(RM, 1) + poly(PTRATIO, 1), data = trainingData)
# Residual QQ plots for the three original multiple models.
par(mfcol = c(1,3), omi = c(.5,.5,.5,.5))
makeqq(degree_1_original_model,"Original Linear Model QQ Plot" , 2)
makeqq(degree_2_original_model,"Original Quadratic Model QQ Plot" , 4)
makeqq(degree_3_original_model,"Original Cubic Model QQ Plot" , 6)
# Define Homoscedasticity Function
# Plot a model's residuals against MEDV to assess homoscedasticity.
#   theSecondOutput - an lm fit (or summary) exposing $residuals.
#   trainingData    - data frame supplying the MEDV x-values.
#   titleVal        - panel title. colNum indexes the global qqCol/qqColT palettes.
#   supressCol/supressRow - suppress the y-/x-axis titles in shared-axis grids.
#   evenPow         - accepted for call-site compatibility but currently unused.
# Fixed: the default `trainingData = trainingData` was self-referential and
# raised "promise already under evaluation" whenever the argument was omitted;
# the argument is now simply required (all existing call sites supply it).
makehomosced <- function(theSecondOutput, trainingData, titleVal,
                         supressCol = FALSE, supressRow = FALSE,
                         evenPow = FALSE, colNum) {
  plot(trainingData$MEDV, theSecondOutput$residuals,
       ann = FALSE,
       axes = FALSE,
       col = qqColT[colNum],
       lwd = 4.5,
       xlim = range(pretty(range(trainingData$MEDV))),
       # Fixed y-range shared across panels so plots are directly comparable.
       ylim = range(pretty(range(-30, 40))))
  mtext(titleVal, side = 3, las = 1, adj = .1, font = 2, col = qqCol[colNum], cex = 1)
  axis(2, at = pretty(range(-30, 40)), lwd = 2, tck = -.0125, labels = FALSE)
  mtext(pretty(range(-30, 40)), side = 2, at = pretty(range(-30, 40)), line = 1, lwd = 1, las = 1)
  if (!supressCol) {
    mtext("Residuals", side = 2, line = 2.75, font = 2)
  }
  # x-axis drawn at y = 0 so positive and negative residuals are visually split.
  axis(1, at = pretty(range(trainingData$MEDV)), pos = 0, lwd = 2, tck = -.0125, labels = FALSE)
  #mtext(pretty(range(trainingData$MEDV)), at = pretty(trainingData$MEDV),side =1 , line = -8)
  if (!supressRow) {
    mtext("Median Value of Owner Occupied Homes ($1000USD)", side = 1, line = 3.0, font = 2)
  }
}
# Homoscedasticity Plots
# Homoscedasticity panels for the three original multiple models.
par(mfcol = c(1,3), omi = c(.5,.5,.5,.5))
makehomosced(degree_1_original_model, trainingData, "Original Linear Model\n Homoscedasticity Plot", colNum = 2, supressRow = T)
makehomosced(degree_2_original_model, trainingData, "Original Quadratic Model\n Homoscedasticity Plot", colNum = 4, supressCol = T)
makehomosced(degree_3_original_model, trainingData, "Original Cubic Model\n Homoscedasticity Plot", colNum = 6, supressCol = T, supressRow = T )
# Make Augmented Models
# Augmented polynomial models: original predictors plus CRIM, degrees 3/2/1.
degree_3_augmented_model <- lm(MEDV ~ poly(LSTAT, 3) + poly(RM, 3) + poly(PTRATIO, 3) + poly(CRIM, 3), data = trainingData)
degree_2_augmented_model <- lm(MEDV ~ poly(LSTAT, 2) + poly(RM, 2) + poly(PTRATIO, 2) + poly(CRIM, 2), data = trainingData)
degree_1_augmented_model <- lm(MEDV ~ poly(LSTAT, 1) + poly(RM, 1) + poly(PTRATIO, 1) + poly(CRIM, 1), data = trainingData)
# Augmented Model QQ Plots
# Residual QQ plots for the three augmented multiple models.
# Fixed: these previously plotted the *original* models despite the
# "Augmented" titles.
par(mfcol = c(1, 3), omi = c(.5, .5, .5, .5))
makeqq(degree_1_augmented_model, "Augmented Linear Model\n QQ Plot", 8)
makeqq(degree_2_augmented_model, "Augmented Quadratic Model\n QQ Plot", 10)
makeqq(degree_3_augmented_model, "Augmented Cubic Model\n QQ Plot", 12)
# Augmented Homoscedasticity Plots
# Homoscedasticity panels for the three augmented multiple models.
# Fixed: these previously plotted the *original* models despite the
# "Augmented" titles.
par(mfcol = c(1, 3), omi = c(.5, .5, .5, .5))
makehomosced(degree_1_augmented_model, trainingData, "Augmented Linear Model\n Homoscedasticity Plot", colNum = 8, supressRow = TRUE)
makehomosced(degree_2_augmented_model, trainingData, "Augmented Quadratic Model\n Homoscedasticity Plot", colNum = 10, supressCol = TRUE)
makehomosced(degree_3_augmented_model, trainingData, "Augmented Cubic Model\n Homoscedasticity Plot", colNum = 12, supressCol = TRUE, supressRow = TRUE)
# Linear Model Comparison
# Predict MEDV for a fitted model.
# Generalized: newdata defaults to the global testingData so the existing
# one-argument call sites keep working, but the evaluation data can now be
# passed explicitly.
makePredict <- function(model, newdata = testingData) {
  predict(model, newdata = newdata, type = "response")
}
# Define getAIC
# AIC (up to an additive constant) from test-set predictions:
#   n * log(SSE / n) + 2k
# Fixed: the 2k penalty was previously inside the parentheses, i.e.
# n * (log(SSE/n) + 2k), which multiplied the penalty by n. Also removed an
# unused R^2 computation. Uses the global testingData for the observed MEDV.
getAIC <- function(ogYPrediction, nParams) {
  n <- nrow(testingData)
  ogSSE <- sum((testingData$MEDV - ogYPrediction)^2)
  n * log(ogSSE / n) + 2 * nParams
}
# Define getR2
# Proportion of "explained" variation in the test-set predictions, computed
# as SSR / (SSR + SSE) against the global testingData outcome.
getR2 <- function(ogYPrediction) {
  observed <- testingData$MEDV
  sse <- sum((observed - ogYPrediction)^2)
  ssr <- sum((ogYPrediction - mean(observed))^2)
  ssr / (ssr + sse)
}
# Define Function to get Regression Plots with AIC and R2
# Draw side-by-side predicted-vs-actual scatterplots for the original and
# augmented models, annotated with their test-set R^2 and AIC.
#   ogYPrediction/sgYPrediction - predictions from the two models.
#   colNum1/colNum2             - palette indices into the global qqCol/qqColT.
#   ogR2/ogAIC/sgR2/sgAIC       - precomputed metrics to display.
# Uses the global testingData for the observed MEDV values. Expects the caller
# to have set par(mfrow = c(1, 2)).
makeFinalPlot <- function(ogYPrediction, sgYPrediction, colNum1, colNum2, ogR2, ogAIC, sgR2, sgAIC){
  # --- Original model panel ---
  plot(ogYPrediction, testingData$MEDV,
       ann = F,
       axes = F,
       col = qqColT[colNum1],
       lwd = 3.5,
       xlim = range(pretty(range(ogYPrediction))),
       ylim = range(pretty(range(testingData$MEDV)))
  )
  ogr2text <- ((paste("R2" ,' = ', signif(ogR2,4))))
  # Fixed: signif() was called with an empty digits argument; now 4 to match
  # the augmented panel.
  ogAICtext <- paste("AIC = ", signif(ogAIC, 4))
  mtext(ogr2text, adj = -0.025, col = qqCol[colNum1], font = 2)
  mtext(ogAICtext, adj = 0.35, col = qqCol[colNum1], font = 2)
  mtext("Original Model Predictions vs Actual", side = 3, las =1, font = 2, col = qqCol[colNum1], cex = 1, line = 1)
  axis(1 , at = pretty(range(ogYPrediction)), lwd = 2, tck = -.0125, labels= F)
  mtext(pretty(range(ogYPrediction)),side = 1,at = pretty(range(ogYPrediction)), line = 1, lwd = 1, las = 1)
  mtext("Predicted MEDV Values", side =1 , line = 2.75, font = 2)
  axis(2 , at = pretty(range(testingData$MEDV)), lwd = 2, tck = -.0125, labels= F)
  mtext(pretty(range(testingData$MEDV)),side = 2,at = pretty(range(testingData$MEDV)), line = 1, lwd = 1, las = 1)
  mtext("Actual MEDV Values", side =2 , line = 2.75, font = 2)
  # --- Augmented model panel ---
  plot(sgYPrediction, testingData$MEDV,
       ann = F,
       axes = F,
       col = qqColT[colNum2],
       lwd = 3.5,
       xlim = range(pretty(range(sgYPrediction))),
       ylim = range(pretty(range(testingData$MEDV)))
  )
  sgr2text <- paste("R2 = ",signif(sgR2,4))
  sgAICtext <- paste("AIC = ", signif(sgAIC,4))
  mtext(sgr2text, adj = -.025, col = qqCol[colNum2], font = 2)
  mtext(sgAICtext, adj = .35, col = qqCol[colNum2], font = 2)
  mtext("Augmented Model Predictions vs Actual", side = 3, las =1, font = 2, col = qqCol[colNum2], cex = 1, line = 1)
  axis(1 , at = pretty(range(sgYPrediction)), lwd = 2, tck = -.0125, labels= F)
  mtext(pretty(range(sgYPrediction)),side = 1,at = pretty(range(sgYPrediction)), line = 1, lwd = 1, las = 1)
  mtext("Predicted MEDV Values", side =1 , line = 2.75, font = 2)
  axis(2 , at = pretty(range(testingData$MEDV)), lwd = 2, tck = -.0125, labels= F)
  mtext(pretty(range(testingData$MEDV)),side = 2,at = pretty(range(testingData$MEDV)), line = 1, lwd = 1, las = 1)
  mtext("Actual MEDV Values", side =2 , line = 2.75, font = 2)
}
# We get much better values for R^2 and AIC, but a mitigating factor for this slight increase in R^2 is that we have clearly failed the assumptions of Homoscedasticity and Normality of Residuals, not to mention the assumption of Linearity of the Predictors.
# Compare the degree-1 (linear) original vs augmented models on the test set.
par(mfrow = c(1,2), omi = c(.25,.25,.5,.25))
og <- makePredict(degree_1_original_model)
sg <- makePredict(degree_1_augmented_model)
# nParams: number of predictors (3 original, 4 augmented).
makeFinalPlot(og,
sg,
2,
4,
getR2(og),
getAIC(og,
nParams = 3),
getR2(sg),
getAIC(sg,
nParams = 4))
mtext("Linear Model Comparison", line = 1, outer = T, font =2)
# We yield much more appealing values for both our R^2 and our AIC with the quadratic regression model, while having met the assumptions for Normality of Residuals and for Homoscedasticity.
# Compare the degree-2 (quadratic) original vs augmented models on the test set.
par(mfrow = c(1,2), omi = c(.25,.25,.5,.25))
og <- makePredict(degree_2_original_model)
sg <- makePredict(degree_2_augmented_model)
# nParams: number of predictors (3 original, 4 augmented).
makeFinalPlot(og,
sg,
6,
8,
getR2(og),
getAIC(og,
nParams = 3),
getR2(sg),
getAIC(sg,
nParams = 4))
mtext("Quadratic Regression Model Comparison", line = 1, outer = T, font =2)
# Our cubic model performs better than the linear model, but its AIC and R^2
# scores are no better than those of the quadratic regressions.
# Compare the degree-3 (cubic) original vs augmented models on the test set.
par(mfrow = c(1,2), omi = c(.25,.25,.5,.25))
og <- makePredict(degree_3_original_model)
sg <- makePredict(degree_3_augmented_model)
# nParams: number of predictors (3 original, 4 augmented).
makeFinalPlot(og,
sg,
10,
12,
getR2(og),
getAIC(og,
nParams = 3),
getR2(sg),
getAIC(sg,
nParams = 4))
mtext("Cubic Regression Model Comparison", line = 1, outer = T, font =2)
#Below are the Summary Statistics for the Original Linear Regression
# Model (Data un-transformed)
summary(degree_1_original_model)
##
## Call:
## lm(formula = MEDV ~ poly(LSTAT, 1) + poly(RM, 1) + poly(PTRATIO,
## 1), data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.133 -3.125 -0.260 1.742 61.001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7925 0.3306 68.945 < 2e-16 ***
## poly(LSTAT, 1) -31.2053 8.0953 -3.855 0.000135 ***
## poly(RM, 1) 97.3591 7.8822 12.352 < 2e-16 ***
## poly(PTRATIO, 1) -39.2704 7.1989 -5.455 8.64e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.612 on 396 degrees of freedom
## Multiple R-squared: 0.5164, Adjusted R-squared: 0.5128
## F-statistic: 141 on 3 and 396 DF, p-value: < 2.2e-16
#Below are the Summary Statistics for the Augmented Linear Regression Model (Data un-transformed with CRIM)
summary(degree_1_augmented_model)
##
## Call:
## lm(formula = MEDV ~ poly(LSTAT, 1) + poly(RM, 1) + poly(PTRATIO,
## 1) + poly(CRIM, 1), data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.937 -3.349 -0.559 1.728 54.869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7925 0.3214 70.907 < 2e-16 ***
## poly(LSTAT, 1) -18.5199 8.2885 -2.234 0.026 *
## poly(RM, 1) 98.4319 7.6672 12.838 < 2e-16 ***
## poly(PTRATIO, 1) -33.7322 7.0909 -4.757 2.76e-06 ***
## poly(CRIM, 1) -34.8373 7.1313 -4.885 1.50e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.429 on 395 degrees of freedom
## Multiple R-squared: 0.544, Adjusted R-squared: 0.5393
## F-statistic: 117.8 on 4 and 395 DF, p-value: < 2.2e-16
#Below are the Summary Statistics for the Original Quadratic Regression Model (Data un-transformed)
# Quadratic terms give a large jump in fit: multiple R^2 = 0.7293.
summary(degree_2_original_model)
##
## Call:
## lm(formula = MEDV ~ poly(LSTAT, 2) + poly(RM, 2) + poly(PTRATIO,
## 2), data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.077 -2.570 -0.478 1.915 38.673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7925 0.2483 91.801 < 2e-16 ***
## poly(LSTAT, 2)1 -61.6727 6.6764 -9.237 < 2e-16 ***
## poly(LSTAT, 2)2 81.7624 5.3128 15.390 < 2e-16 ***
## poly(RM, 2)1 60.5979 6.4448 9.403 < 2e-16 ***
## poly(RM, 2)2 33.9183 5.0735 6.685 7.92e-11 ***
## poly(PTRATIO, 2)1 -26.2679 5.4958 -4.780 2.49e-06 ***
## poly(PTRATIO, 2)2 13.1952 5.3830 2.451 0.0147 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.966 on 393 degrees of freedom
## Multiple R-squared: 0.7293, Adjusted R-squared: 0.7252
## F-statistic: 176.5 on 6 and 393 DF, p-value: < 2.2e-16
#Below are the Summary Statistics for the Augmented Quadratic Regression Model (Data un-transformed with CRIM)
# Multiple R^2 = 0.7448; the quadratic CRIM term is not significant (p = 0.23).
summary(degree_2_augmented_model)
##
## Call:
## lm(formula = MEDV ~ poly(LSTAT, 2) + poly(RM, 2) + poly(PTRATIO,
## 2) + poly(CRIM, 2), data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.238 -2.475 -0.557 1.782 37.012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7925 0.2417 94.305 < 2e-16 ***
## poly(LSTAT, 2)1 -48.7887 7.1474 -6.826 3.34e-11 ***
## poly(LSTAT, 2)2 75.6829 5.3677 14.100 < 2e-16 ***
## poly(RM, 2)1 63.8003 6.3088 10.113 < 2e-16 ***
## poly(RM, 2)2 38.8969 5.0698 7.672 1.36e-13 ***
## poly(PTRATIO, 2)1 -20.9311 5.4883 -3.814 0.000159 ***
## poly(PTRATIO, 2)2 12.1180 5.2448 2.311 0.021380 *
## poly(CRIM, 2)1 -27.6435 5.6768 -4.870 1.63e-06 ***
## poly(CRIM, 2)2 6.3388 5.3271 1.190 0.234799
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.834 on 391 degrees of freedom
## Multiple R-squared: 0.7448, Adjusted R-squared: 0.7396
## F-statistic: 142.6 on 8 and 391 DF, p-value: < 2.2e-16
#Below are the Summary Statistics for the Original Cubic Regression Model (Data un-transformed)
# Multiple R^2 = 0.7367; the cubic LSTAT and RM terms are not significant.
summary(degree_3_original_model)
##
## Call:
## lm(formula = MEDV ~ poly(LSTAT, 3) + poly(RM, 3) + poly(PTRATIO,
## 3), data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.695 -2.470 -0.484 2.015 34.245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7925 0.2458 92.732 < 2e-16 ***
## poly(LSTAT, 3)1 -60.2979 6.8888 -8.753 < 2e-16 ***
## poly(LSTAT, 3)2 74.3850 5.7319 12.977 < 2e-16 ***
## poly(LSTAT, 3)3 7.5780 5.5079 1.376 0.16966
## poly(RM, 3)1 64.5206 6.6315 9.729 < 2e-16 ***
## poly(RM, 3)2 39.2779 5.4852 7.161 4.02e-12 ***
## poly(RM, 3)3 -5.0234 5.1849 -0.969 0.33321
## poly(PTRATIO, 3)1 -26.6490 5.4790 -4.864 1.67e-06 ***
## poly(PTRATIO, 3)2 12.5147 5.3947 2.320 0.02087 *
## poly(PTRATIO, 3)3 16.1943 5.3262 3.040 0.00252 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.916 on 390 degrees of freedom
## Multiple R-squared: 0.7367, Adjusted R-squared: 0.7307
## F-statistic: 121.3 on 9 and 390 DF, p-value: < 2.2e-16
#Below are the Summary Statistics for the Augmented Cubic Regression Model (Data un-transformed with CRIM)
# Multiple R^2 = 0.7495; only a marginal gain over the quadratic models.
summary(degree_3_augmented_model)
##
## Call:
## lm(formula = MEDV ~ poly(LSTAT, 3) + poly(RM, 3) + poly(PTRATIO,
## 3) + poly(CRIM, 3), data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.310 -2.389 -0.599 1.883 34.208
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7925 0.2407 94.698 < 2e-16 ***
## poly(LSTAT, 3)1 -46.3210 7.8488 -5.902 7.87e-09 ***
## poly(LSTAT, 3)2 69.5701 5.9148 11.762 < 2e-16 ***
## poly(LSTAT, 3)3 5.3677 5.5444 0.968 0.333588
## poly(RM, 3)1 68.0312 6.7365 10.099 < 2e-16 ***
## poly(RM, 3)2 42.1408 5.4266 7.766 7.35e-14 ***
## poly(RM, 3)3 -9.0648 5.2491 -1.727 0.084982 .
## poly(PTRATIO, 3)1 -21.5415 5.5494 -3.882 0.000122 ***
## poly(PTRATIO, 3)2 11.4128 5.2885 2.158 0.031540 *
## poly(PTRATIO, 3)3 10.4413 5.4455 1.917 0.055922 .
## poly(CRIM, 3)1 -26.6186 6.0182 -4.423 1.27e-05 ***
## poly(CRIM, 3)2 5.2504 5.4886 0.957 0.339368
## poly(CRIM, 3)3 -2.5148 5.2848 -0.476 0.634438
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.814 on 387 degrees of freedom
## Multiple R-squared: 0.7495, Adjusted R-squared: 0.7417
## F-statistic: 96.49 on 12 and 387 DF, p-value: < 2.2e-16
Here, I have chosen the best model (based on the R^2 and AIC values above): the Original Quadratic Model.
# Identify high-influence observations for the chosen quadratic model:
# any row whose Cook's distance exceeds the mean Cook's distance is
# flagged as an outlier. The vector names carry the original row numbers,
# which we convert back to integers for subsetting.
cooks_dist_vals <- cooks.distance(degree_2_original_model)
outlier_vals <- as.integer(names(which(cooks_dist_vals > mean(cooks_dist_vals))))
We now have the row numbers of the outliers, so we remove them from the training data and rebuild our chosen quadratic models.
# Remove the flagged outlier rows from the training data, then refit both
# quadratic models on the cleaned data.
# Guard against an empty index: `trainingData[-integer(0), ]` would silently
# drop EVERY row, so only subset when there is at least one outlier to remove.
trainingData_sans_outliers <- if (length(outlier_vals) > 0) {
  trainingData[-outlier_vals, ]
} else {
  trainingData
}
degree_2_original_model_sans_outliers <- lm(
  MEDV ~ poly(RM, 2) + poly(PTRATIO, 2) + poly(LSTAT, 2),
  data = trainingData_sans_outliers
)
degree_2_augmented_model_sans_outliers <- lm(
  MEDV ~ poly(RM, 2) + poly(PTRATIO, 2) + poly(LSTAT, 2) + poly(CRIM, 2),
  data = trainingData_sans_outliers
)
We now check the residuals of our quadratic models: normality via Q-Q plots, then homoscedasticity.
# 2x2 grid of residual Q-Q plots for the four quadratic model variants
# (original/augmented, with and without outliers).
# NOTE(review): the `supressRow`/`supressCol` argument names are misspelled,
# but they match makeqq()'s actual signature defined earlier in this file,
# so they must stay as-is here.
par(mfrow = c(2, 2), omi = c(0.5, 0.5, 0.5, 0.5))
makeqq(degree_2_original_model,
       "Original Multiple\n Quadratic Regression Model",
       colNum = 9, supressRow = TRUE)
makeqq(degree_2_original_model_sans_outliers,
       "Original Multiple\n Quadratic Regression Model\nOutliers Removed",
       colNum = 10, supressRow = TRUE, supressCol = TRUE)
makeqq(degree_2_augmented_model,
       "Augmented Multiple\n Quadratic Regression Model",
       colNum = 7)
makeqq(degree_2_augmented_model_sans_outliers,
       "Augmented Multiple\n Quadratic Regression Model\nOutliers Removed",
       colNum = 8, supressCol = TRUE)
# 2x2 grid of homoscedasticity (residuals vs. fitted) plots for the same
# four quadratic model variants; each call also receives the data the model
# was fitted on.
# NOTE(review): `supressRow`/`supressCol` are misspelled but match
# makehomosced()'s actual signature defined earlier in this file.
par(mfrow = c(2, 2), omi = c(0.5, 0.5, 0.5, 0.5))
makehomosced(degree_2_original_model, trainingData,
             "Original Multiple\n Quadratic Regression Model",
             colNum = 9, supressRow = TRUE)
makehomosced(degree_2_original_model_sans_outliers, trainingData_sans_outliers,
             "Original Multiple\n Quadratic Regression Model\nOutliers Removed",
             colNum = 10, supressRow = TRUE, supressCol = TRUE)
makehomosced(degree_2_augmented_model, trainingData,
             "Augmented Multiple\n Quadratic Regression Model",
             colNum = 7)
makehomosced(degree_2_augmented_model_sans_outliers, trainingData_sans_outliers,
             "Augmented Multiple\n Quadratic Regression Model\nOutliers Removed",
             colNum = 8, supressCol = TRUE)
Summary Reports: first, we plot with the outliers included.
# Side-by-side final prediction plots for the two quadratic models
# (outliers included), annotated with R^2 and AIC.
par(mfrow = c(1, 2), omi = c(0.5, 0.5, 0.5, 0.5))
ogPredict <- makePredict(degree_2_original_model)
sgPredict <- makePredict(degree_2_augmented_model)
makeFinalPlot(ogPredict,
              sgPredict,
              ogR2 = getR2(ogPredict),
              ogAIC = getAIC(ogPredict, 3),
              sgR2 = getR2(sgPredict),
              sgAIC = getAIC(sgPredict, 4),
              colNum1 = 1,
              colNum2 = 3)
# outer = TRUE places the shared title in the outer margin
mtext("Multiple Quadratic Regression Plot", outer = TRUE, font = 2, line = 1)
Next, we plot with the outliers removed.
# Side-by-side final prediction plots for the two quadratic models refit
# without the high-influence outliers, annotated with R^2 and AIC.
par(mfrow = c(1, 2), omi = c(0.5, 0.5, 0.5, 0.5))
ogPredict <- makePredict(degree_2_original_model_sans_outliers)
sgPredict <- makePredict(degree_2_augmented_model_sans_outliers)
makeFinalPlot(ogPredict,
              sgPredict,
              ogR2 = getR2(ogPredict),
              ogAIC = getAIC(ogPredict, 3),
              sgR2 = getR2(sgPredict),
              sgAIC = getAIC(sgPredict, 4),
              colNum1 = 2,
              colNum2 = 4)
# outer = TRUE places the shared title in the outer margin
mtext("Multiple Quadratic Regression Plot - Outliers Removed",
      outer = TRUE, font = 2, line = 1)
Removal of the outliers makes little difference in the quadratic regression model.